Spaces:

soundsauce
/

soundsauce-old

Build error

App Files Files Community

mattricesound committed on Jul 28, 2023

Commit

e316ba7

•

1 Parent(s): c2a47f9

Remove unnecessary components. Add examples back in. Add the ability to swap the local model for the MusicGen Gradio API

Browse files

Files changed (1) hide show

app.py +113 -108

app.py CHANGED Viewed

@@ -16,29 +16,35 @@ from tempfile import NamedTemporaryFile
 import time
 import typing as tp
 import warnings
 import torch
 import gradio as gr
 from audiocraft.data.audio_utils import convert_audio
-from audiocraft.data.audio import audio_write
 from audiocraft.models import MusicGen
 from demucs import pretrained
 from demucs.apply import apply_model
 from demucs.audio import convert_audio
 MODEL = None  # Last used model
 DEMUCS_MODEL = None
 MAX_BATCH_SIZE = 12
 INTERRUPTING = False
 # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
 _old_call = sp.call
 stem2idx = {'drums': 0, 'bass': 1, 'other': 2, 'vocal': 3}
 stem_idx = torch.LongTensor([stem2idx['vocal'], stem2idx['other'], stem2idx['bass']])
 def _call_nostderr(*args, **kwargs):
@@ -94,14 +100,19 @@ def make_waveform(*args, **kwargs):
 def load_model(version='melody'):
     global MODEL, DEMUCS_MODEL
-    print("Loading model", version)
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    if MODEL is None or MODEL.name != version:
-        # If gpu is not available, we'll use cpu.
-        MODEL = MusicGen.get_pretrained(version, device=device)
     if DEMUCS_MODEL is None:
         DEMUCS_MODEL = pretrained.get_model('htdemucs').to(device)
 def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
     MODEL.set_generation_params(duration=duration, **gen_kwargs)
@@ -149,31 +160,19 @@ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
         demucs_output = demucs_output.cpu()
         # Naming
-        filename = f"temp/{texts[0][:10]}.wav"
-        d_filename = f"temp/{texts[0][:10]}_no_drums.wav"
         # If path exists, add number. If number exists, update number.
         i = 1
-        while Path(filename).exists():
-            filename = f"{texts[0][:10]}_{i}.wav"
-            d_filename = f"{texts[0][:10]}_{i}_no_drums.wav"
             i += 1
-        # with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-        audio_write(
-            filename, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-        # out_files.append(pool.submit(make_waveform, filename))
-        out_files.append(filename)
-        file_cleaner.add(filename)
-    # with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         audio_write(
             d_filename, demucs_output, MODEL.sample_rate, strategy="loudness",
             loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         out_files.append(d_filename)
-        # out_files.append(pool.submit(make_waveform, d_filename))
         file_cleaner.add(d_filename)
-    # res = [out_file.result() for out_file in out_files]
     res = [out_file for out_file in out_files]
     for file in res:
         file_cleaner.add(file)
@@ -183,18 +182,10 @@ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
-def predict_full(text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
     global INTERRUPTING
     INTERRUPTING = False
-    if temperature < 0:
-        raise gr.Error("Temperature must be >= 0.")
-    if topk < 0:
-        raise gr.Error("Topk must be non-negative.")
-    if topp < 0:
-        raise gr.Error("Topp must be non-negative.")
-    topk = int(topk)
     def _progress(generated, to_generate):
         progress((generated, to_generate))
         if INTERRUPTING:
@@ -202,77 +193,114 @@ def predict_full(text, melody, duration, topk, topp, temperature, cfg_coef, prog
     MODEL.set_custom_progress_callback(_progress)
     outs = _do_predictions(
-        [text], [melody], duration, progress=True,
-        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
-    return outs[0], outs[1], outs[0], outs[1]
-def toggle_audio_src(choice):
-    if choice == "mic":
-        return gr.update(source="microphone", value=None, label="Microphone")
-    else:
-        return gr.update(source="upload", value=None, label="File")
 def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     text = gr.Text(label="Input Text", interactive=True)
                     with gr.Column():
-                        radio = gr.Radio(["file", "mic"], value="file",
-                                         label="Condition on a melody (optional) File or Mic")
-                        melody = gr.Audio(source="upload", type="numpy", label="File",
-                                          interactive=True, elem_id="melody-input")
                 with gr.Row():
                     submit = gr.Button("Submit")
                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
-                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
-                with gr.Row():
-                    duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
-                with gr.Row():
-                    topk = gr.Number(label="Top-k", value=250, interactive=True)
-                    topp = gr.Number(label="Top-p", value=0, interactive=True)
-                    temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
-                    cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
-                with gr.Row():
-                    # output_normal = gr.Video(label="Generated Music")
-                    output_normal = gr.Audio(label="Generated Music")
-                with gr.Row():
-                    file_download = gr.File(label="Download")
-                with gr.Row():
-                    # output_without_drum = gr.Video(label="Removed drums")
-                    output_without_drum = gr.Audio(label="Removed drums")
-                with gr.Row():
-                    file_download_no_drum = gr.File(label="Download")
-                with gr.Row():
                     gr.Markdown(
                         """
                         Note that the files will be deleted after 10 minutes, so make sure to download!
                         """
                     )
-        submit.click(predict_full,
-                     inputs=[text, melody, duration, topk, topp, temperature, cfg_coef],
-                     outputs=[output_normal, output_without_drum, file_download, file_download_no_drum])
-        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
-        gr.Markdown(
-            """
-            ### More details
-            The model will generate a short music extract based on the description you provided.
-            The model can generate up to 30 seconds of audio in one pass. It is now possible
-            to extend the generation by feeding back the end of the previous chunk of audio.
-            This can take a long time, and the model might lose consistency. The model might also
-            decide at arbitrary positions that the song ends.
-            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
-            An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
-            are generated each time.
-            """
         )
         interface.queue().launch(**launch_kwargs)
@@ -286,41 +314,18 @@ if __name__ == "__main__":
         default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
         help='IP to listen on for connections to Gradio',
     )
-    parser.add_argument(
-        '--username', type=str, default='', help='Username for authentication'
-    )
-    parser.add_argument(
-        '--password', type=str, default='', help='Password for authentication'
-    )
-    parser.add_argument(
-        '--server_port',
-        type=int,
-        default=0,
-        help='Port to run the server listener on',
-    )
-    parser.add_argument(
-        '--inbrowser', action='store_true', help='Open in browser'
-    )
-    parser.add_argument(
-        '--share', action='store_true', help='Share the gradio UI'
-    )
     args = parser.parse_args()
     launch_kwargs = {}
     launch_kwargs['server_name'] = args.listen
-    if args.username and args.password:
-        launch_kwargs['auth'] = (args.username, args.password)
-    if args.server_port:
-        launch_kwargs['server_port'] = args.server_port
-    if args.inbrowser:
-        launch_kwargs['inbrowser'] = args.inbrowser
-    if args.share:
-        launch_kwargs['share'] = args.share
     # Load melody model
     load_model()
     if not os.path.exists("temp"):
         os.mkdir("temp")
     # Show the interface

 import time
 import typing as tp
 import warnings
+import glob
 import torch
 import gradio as gr
+import numpy as np
 from audiocraft.data.audio_utils import convert_audio
+from audiocraft.data.audio import audio_write, audio_read
 from audiocraft.models import MusicGen
 from demucs import pretrained
 from demucs.apply import apply_model
 from demucs.audio import convert_audio
+from gradio_client import Client
+# Generation backend switch: when True a MusicGen model is loaded locally,
+# when False requests go to the remote endpoint held in `client`.
+LOCAL = False
 MODEL = None  # Last used model
+# Demucs source-separation model; loaded once by load_model().
 DEMUCS_MODEL = None
 MAX_BATCH_SIZE = 12
+# Reset to False at the start of predict_full and checked by its progress callback.
 INTERRUPTING = False
+# gradio_client.Client for the remote endpoint; assigned by connect_to_endpoint().
+client = None
 # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
 _old_call = sp.call
+# Stem name -> index along the stem axis of the Demucs output.
+# NOTE(review): presumably mirrors the htdemucs source order - confirm.
 stem2idx = {'drums': 0, 'bass': 1, 'other': 2, 'vocal': 3}
+# Indices of the stems that are kept and summed (every stem except drums).
 stem_idx = torch.LongTensor([stem2idx['vocal'], stem2idx['other'], stem2idx['bass']])
+# Pool of clips that select_new_melody() picks from at random.
+melody_files = glob.glob('clips/**/*.mp3', recursive=True)
 def _call_nostderr(*args, **kwargs):
 def load_model(version='melody'):
+    """Load the models the app needs into the module-level globals.
+
+    MusicGen is loaded only when LOCAL is set, and only if no model is cached
+    or the cached model's name differs from `version`. The 'htdemucs' Demucs
+    model is loaded once regardless of LOCAL. Both are placed on CUDA when it
+    is available, otherwise on CPU.
+
+    Args:
+        version: Name passed to MusicGen.get_pretrained.
+    """
     global MODEL, DEMUCS_MODEL
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    if LOCAL:
+        if MODEL is None or MODEL.name != version:
+            print("Loading model", version)
+            # If gpu is not available, we'll use cpu.
+            MODEL = MusicGen.get_pretrained(version, device=device)
     if DEMUCS_MODEL is None:
         DEMUCS_MODEL = pretrained.get_model('htdemucs').to(device)
+def connect_to_endpoint():
+    global client
+    client = Client("https://facebook-musicgen--44zzp.hf.space/")
 def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
     MODEL.set_generation_params(duration=duration, **gen_kwargs)
         demucs_output = demucs_output.cpu()
         # Naming
+        d_filename = f"temp/{texts[0][:10]}.wav"
         # If path exists, add number. If number exists, update number.
         i = 1
+        while Path(d_filename).exists():
+            d_filename = f"temp/{texts[0][:10]}_{i}.wav"
             i += 1
         audio_write(
             d_filename, demucs_output, MODEL.sample_rate, strategy="loudness",
             loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         out_files.append(d_filename)
         file_cleaner.add(d_filename)
     res = [out_file for out_file in out_files]
     for file in res:
         file_cleaner.add(file)
+def predict_full(text, melody, progress=gr.Progress()):
     global INTERRUPTING
     INTERRUPTING = False
+    print("Running local model")
     def _progress(generated, to_generate):
         progress((generated, to_generate))
         if INTERRUPTING:
     MODEL.set_custom_progress_callback(_progress)
     outs = _do_predictions(
+        [text], [melody], duration=10, progress=True)
+    return outs[0], gr.File.update(value=outs[0], visible=True)
+def select_new_melody():
+    new_melody_file = np.random.choice(melody_files)
+    return gr.update(source="upload", value=new_melody_file)
+def run_remote_model(text, melody):
+    print("Running Audiocraft API model with text", text, "and melody", melody)
+    result = client.predict(
+                    text,	# str  in 'Describe your music' Textbox component
+                    melody,	# str (filepath or URL to file) in 'File' Audio component
+                    fn_index=0
+    )
+    # Naming
+    d_filename = os.path.join("temp", f"{text[:10]}.wav")
+    # If path exists, add number. If number exists, update number.
+    i = 1
+    while Path(d_filename).exists():
+        d_filename = os.path.join("temp", f"{text[:10]}_{i}.wav")
+        i += 1
+    # Convert mp4 to wav, using ffmpeg
+    # ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 output.wav
+    sp.run(["ffmpeg", "-i", result, "-vn", "-acodec", "pcm_s16le", "-ar", "32000", "-ac", "1", d_filename])
+    # Load wav file
+    output, sr = audio_read(d_filename)
+    # Demucs
+    print("Running demucs")
+    wav = convert_audio(output, sr, DEMUCS_MODEL.samplerate, DEMUCS_MODEL.audio_channels)
+    wav = wav.unsqueeze(0)
+    stems = apply_model(DEMUCS_MODEL, wav)
+    stems = stems[:, stem_idx]  # extract stem
+    stems = stems.sum(1)  # merge extracted stems
+    stems = convert_audio(stems, DEMUCS_MODEL.samplerate, 32000, 1)
+    demucs_output = stems[0]
+    output = output.cpu()
+    demucs_output = demucs_output.cpu()
+    audio_write(
+        d_filename, demucs_output, 32000, strategy="loudness",
+        loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
+    file_cleaner.add(d_filename)
+    print("Finished", text)
+    print("Tempfiles currently stored: ", len(file_cleaner.files))
+    return d_filename, gr.File.update(value=d_filename, visible=True)
 def ui_full(launch_kwargs):
+    """Build the Gradio Blocks interface and launch it.
+
+    The Submit button is wired to predict_full when LOCAL is set and to
+    run_remote_model otherwise; the "New Melody" button swaps in a random clip.
+
+    Args:
+        launch_kwargs: Keyword arguments forwarded to `launch()`.
+    """
     with gr.Blocks() as interface:
+        gr.Markdown(
+            """
+            # Soundsauce Melody Playground
+            """
+        )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     text = gr.Text(label="Input Text", interactive=True)
                     with gr.Column():
+                        # previously, type="numpy"
+                        # Remote mode hands the melody to the endpoint as a file path.
+                        if LOCAL:
+                            audio_type="numpy"
+                        else:
+                            audio_type="filepath"
+                        melody = gr.Audio(type=audio_type, label="File",
+                                          interactive=True, elem_id="melody-input", value="clips/chipmunk.wav")
+                        new_melody = gr.Button("New Melody", interactive=True)
                 with gr.Row():
                     submit = gr.Button("Submit")
                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
+                    # _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
             with gr.Column():
+                    output_without_drum = gr.Audio(label="Output")
+                    # Hidden until a handler returns an update with visible=True.
+                    file_download_no_drum = gr.File(label="Download", visible=False)
                     gr.Markdown(
                         """
                         Note that the files will be deleted after 10 minutes, so make sure to download!
                         """
                     )
+        if LOCAL:
+            submit.click(predict_full,
+                        inputs=[text, melody],
+                        outputs=[output_without_drum, file_download_no_drum])
+        else:
+            submit.click(run_remote_model, inputs=[text, melody], outputs=[output_without_drum, file_download_no_drum])
+        new_melody.click(select_new_melody, outputs=[melody])
+        # NOTE(review): fn is always predict_full, which takes (text, melody) and
+        # uses MODEL, while only `text` is supplied here and MODEL is not loaded
+        # when LOCAL is False - confirm this fn is never actually invoked
+        # (i.e. example caching / run-on-click stay disabled).
+        gr.Examples(
+            fn=predict_full,
+            examples=[
+                ["Enchanting Flute Trills amidst Misty String Section"],
+                ["Gliding Mellotron Strings over Vibrant Phrases"],
+                ["Synth Brass Melody Floating over Airy Wind Chimes"],
+                ["Echoing Electric Guitar Licks with Ethereal Vocal Chops"],
+                ["Rhythmic Acoustic Guitar Licks with Echoing Layers"],
+                ["Whimsical Flute Flourishes in a Mystical Forest Glade"],
+                ["Airy Piccolo Trills accompanied by Floating Harp Arpeggios"],
+                ["Dreamy Harp Glissandos accompanied by Distant Celesta"],
+                ["Hypnotic Synth Pads layered with Enigmatic Guitar Progressions"],
+                ["Enchanting Kalimba Melodies atop Mystical Atmosphere"],
+            ],
+            inputs=[text],
+            outputs=[output_without_drum, file_download_no_drum]
         )
         interface.queue().launch(**launch_kwargs)
         default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
         help='IP to listen on for connections to Gradio',
     )
+    parser.add_argument("--local", action="store_true", help="Run locally instead of using API")
     args = parser.parse_args()
     launch_kwargs = {}
     launch_kwargs['server_name'] = args.listen
+    LOCAL = args.local
     # Load melody model
     load_model()
+    if not LOCAL:
+        connect_to_endpoint()
     if not os.path.exists("temp"):
         os.mkdir("temp")
     # Show the interface