mattricesound committed
Commit d9755fb
1 Parent(s): 8180c66

Add demucs output, load melody model on launch

Files changed (1)
  1. app.py +29 -68
app.py CHANGED
@@ -102,7 +102,7 @@ def load_model(version='melody'):
     MODEL = MusicGen.get_pretrained(version, device=device)
 
 
-def _do_predictions(texts, melodies, duration, progress=False, drums=True, **gen_kwargs):
+def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
     MODEL.set_generation_params(duration=duration, **gen_kwargs)
     print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
     be = time.time()
@@ -135,22 +135,25 @@ def _do_predictions(texts, melodies, duration, progress=False, drums=True, **gen
     out_files = []
     for output in outputs:
         # Demucs
-        if not drums:
-            print("Running demucs")
-            wav = convert_audio(output, MODEL.sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
-            wav = wav.unsqueeze(0)
-            stems = apply_model(demucs_model, wav)
-            stems = stems[:, stem_idx]  # extract stem
-            stems = stems.sum(1)  # merge extracted stems
-            stems = convert_audio(stems, demucs_model.samplerate, MODEL.sample_rate, 1)
-            output = stems[0]
+        print("Running demucs")
+        wav = convert_audio(output, MODEL.sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
+        wav = wav.unsqueeze(0)
+        stems = apply_model(demucs_model, wav)
+        stems = stems[:, stem_idx]  # extract stem
+        stems = stems.sum(1)  # merge extracted stems
+        stems = convert_audio(stems, demucs_model.samplerate, MODEL.sample_rate, 1)
+        demucs_output = stems[0]
+
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(
                 file.name, output, MODEL.sample_rate, strategy="loudness",
                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-
-
-
+        out_files.append(pool.submit(make_waveform, file.name))
+        file_cleaner.add(file.name)
+        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+            audio_write(
+                file.name, demucs_output, MODEL.sample_rate, strategy="loudness",
+                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         out_files.append(pool.submit(make_waveform, file.name))
         file_cleaner.add(file.name)
     res = [out_file.result() for out_file in out_files]
@@ -169,7 +172,7 @@ def predict_batched(texts, melodies):
     return [res]
 
 
-def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef, drums, progress=gr.Progress()):
+def predict_full(text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
     global INTERRUPTING
     INTERRUPTING = False
     if temperature < 0:
@@ -180,7 +183,7 @@ def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coe
         raise gr.Error("Topp must be non-negative.")
 
     topk = int(topk)
-    load_model(model)
+    # load_model(model)
 
     def _progress(generated, to_generate):
         progress((generated, to_generate))
@@ -190,11 +193,9 @@ def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coe
 
     outs = _do_predictions(
         [text], [melody], duration, progress=True,
-        top_k=topk, top_p=topp, temperature=temperature, drums=drums, cfg_coef=cfg_coef)
-
-
+        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
 
-    return outs[0]
+    return outs[0], outs[1]
 
 
 def toggle_audio_src(choice):
@@ -219,9 +220,6 @@ def ui_full(launch_kwargs):
                     submit = gr.Button("Submit")
                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                     _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
-                with gr.Row():
-                    model = gr.Radio(["melody", "medium", "small", "large"],
-                                     label="Model", value="melody", interactive=True)
                 with gr.Row():
                     duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
                 with gr.Row():
@@ -229,13 +227,15 @@ def ui_full(launch_kwargs):
                     topp = gr.Number(label="Top-p", value=0, interactive=True)
                     temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
-                with gr.Row():
-                    drums = gr.Checkbox(label="Drums", value=True, interactive=True)
             with gr.Column():
-                output = gr.Video(label="Generated Music")
+                with gr.Row():
+                    output_normal = gr.Video(label="Generated Music")
+                with gr.Row():
+                    output_without_drum = gr.Video(label="Removed drums")
+
         submit.click(predict_full,
-                     inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef, drums],
-                     outputs=[output])
+                     inputs=[text, melody, duration, topk, topp, temperature, cfg_coef],
+                     outputs=[output_normal, output_without_drum])
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
         gr.Markdown(
             """
@@ -251,20 +251,6 @@ def ui_full(launch_kwargs):
             An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
             are generated each time.
 
-            We present 4 model variations:
-            1. Melody -- a music generation model capable of generating music condition
-                on text and melody inputs. **Note**, you can also use text only.
-            2. Small -- a 300M transformer decoder conditioned on text only.
-            3. Medium -- a 1.5B transformer decoder conditioned on text only.
-            4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
-
-            When using `melody`, you can optionaly provide a reference audio from
-            which a broad melody will be extracted. The model will then try to follow both
-            the description and melody provided.
-
-            You can also use your own GPU or a Google Colab by following the instructions on our repo.
-            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-            for more details.
             """
         )
 
@@ -304,33 +290,6 @@ def ui_batched(launch_kwargs):
         submit.click(predict_batched, inputs=[text, melody],
                      outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
-        # gr.Examples(
-        #     fn=predict_batched,
-        #     examples=[
-        #         [
-        #             "An 80s driving pop song with heavy drums and synth pads in the background",
-        #             "./assets/bach.mp3",
-        #         ],
-        #         [
-        #             "A cheerful country song with acoustic guitars",
-        #             "./assets/bolero_ravel.mp3",
-        #         ],
-        #         [
-        #             "90s rock song with electric guitar and heavy drums",
-        #             None,
-        #         ],
-        #         [
-        #             "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
-        #             "./assets/bach.mp3",
-        #         ],
-        #         [
-        #             "lofi slow bpm electro chill with organic samples",
-        #             None,
-        #         ],
-        #     ],
-        #     inputs=[text, melody],
-        #     outputs=[output]
-        # )
         gr.Markdown("""
         ### More details
 
@@ -389,6 +348,8 @@ if __name__ == "__main__":
     if args.share:
         launch_kwargs['share'] = args.share
 
+    # Load melody model
+    load_model()
     # Show the interface
     if IS_BATCHED:
         ui_batched(launch_kwargs)
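For context on the Demucs step in `_do_predictions`: `apply_model` splits the generated clip into stems (drums, bass, other, vocals), `stem_idx` selects the non-drum sources, and summing them re-mixes a drum-free track. Below is a minimal standalone sketch of the same pipeline; the `get_model("htdemucs")` call and the `remove_drums` helper are assumptions for illustration, since this commit does not show how `demucs_model` and `stem_idx` are defined elsewhere in app.py.

import torch
from demucs.apply import apply_model
from demucs.pretrained import get_model
from audiocraft.data.audio_utils import convert_audio

# Assumed setup: app.py presumably builds these elsewhere;
# "htdemucs" is a hypothetical choice of pretrained Demucs model.
demucs_model = get_model("htdemucs")
stem_idx = torch.LongTensor(
    [i for i, name in enumerate(demucs_model.sources) if name != "drums"])

def remove_drums(output: torch.Tensor, sample_rate: int) -> torch.Tensor:
    """Hypothetical helper: return mono audio with the drum stem removed."""
    # Demucs expects its own sample rate/channel count and a batch dimension.
    wav = convert_audio(output, sample_rate,
                        demucs_model.samplerate, demucs_model.audio_channels)
    wav = wav.unsqueeze(0)
    stems = apply_model(demucs_model, wav)  # (batch, sources, channels, time)
    stems = stems[:, stem_idx]              # keep every source except drums
    mix = stems.sum(1)                      # merge kept sources into one track
    mix = convert_audio(mix, demucs_model.samplerate, sample_rate, 1)
    return mix[0]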
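The UI side of the change wires a single callback to two `gr.Video` components; Gradio matches the callback's return tuple to the `outputs` list positionally, which is why `predict_full` now returns `outs[0], outs[1]`. A stripped-down sketch of that pattern, with a hypothetical `make_videos` standing in for `predict_full`:

import gradio as gr

# Hypothetical stand-in for predict_full: returns (normal video, drum-removed video).
def make_videos(text: str):
    # The real app returns waveform videos built by make_waveform;
    # None is returned here only to keep the sketch self-contained.
    return None, None

with gr.Blocks() as demo:
    text = gr.Text(label="Input Text")
    submit = gr.Button("Submit")
    with gr.Column():
        with gr.Row():
            output_normal = gr.Video(label="Generated Music")
        with gr.Row():
            output_without_drum = gr.Video(label="Removed drums")
    # One click event feeds both outputs; the return tuple is matched in order.
    submit.click(make_videos, inputs=[text],
                 outputs=[output_normal, output_without_drum])

# demo.launch()  # uncomment to run locally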