ai-tube-model-musicgen-4

Paused

App Files Community

jbilcke-hf HF Staff committed on Dec 11, 2023

Commit

243ff9b

1 Parent(s): 66eea88

Update demos/musicgen_app.py

Browse files

Files changed (1) hide show

demos/musicgen_app.py +14 -102

demos/musicgen_app.py CHANGED Viewed

@@ -32,8 +32,7 @@ SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')
 MODEL = None  # Last used model
 SPACE_ID = os.environ.get('SPACE_ID', '')
-IS_BATCHED = "facebook/MusicGen" in SPACE_ID or 'musicgen-internal/musicgen_dev' in SPACE_ID
-print(IS_BATCHED)
 MAX_BATCH_SIZE = 12
 BATCHED_DURATION = 15
 INTERRUPTING = False
@@ -82,17 +81,6 @@ class FileCleaner:
 file_cleaner = FileCleaner()
-def make_waveform(*args, **kwargs):
-    # Further remove some warnings.
-    be = time.time()
-    with warnings.catch_warnings():
-        warnings.simplefilter('ignore')
-        out = gr.make_waveform(*args, **kwargs)
-        print("Make a video took", time.time() - be)
-        return out
 def load_model(version='facebook/musicgen-melody'):
     global MODEL
     print("Loading model", version)
@@ -153,30 +141,25 @@ def _do_predictions(texts, melodies, duration, progress=False, gradio_progress=N
             outputs_diffusion = rearrange(outputs_diffusion, '(s b) c t -> b (s c) t', s=2)
         outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
     outputs = outputs.detach().cpu().float()
-    pending_videos = []
     out_wavs = []
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(
                 file.name, output, MODEL.sample_rate, strategy="loudness",
                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-            pending_videos.append(pool.submit(make_waveform, file.name))
             out_wavs.append(file.name)
             file_cleaner.add(file.name)
-    out_videos = [pending_video.result() for pending_video in pending_videos]
-    for video in out_videos:
-        file_cleaner.add(video)
     print("batch finished", len(texts), time.time() - be)
     print("Tempfiles currently stored: ", len(file_cleaner.files))
-    return out_videos, out_wavs
 def predict_batched(texts, melodies):
     max_text_length = 512
     texts = [text[:max_text_length] for text in texts]
     load_model('facebook/musicgen-stereo-melody')
-    res = _do_predictions(texts, melodies, BATCHED_DURATION)
-    return res
 def predict_full(secret_token, model, model_path, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
@@ -222,14 +205,13 @@ def predict_full(secret_token, model, model_path, decoder, text, melody, duratio
             raise gr.Error("Interrupted.")
     MODEL.set_custom_progress_callback(_progress)
-    videos, wavs = _do_predictions(
         [text], [melody], duration, progress=True,
         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef,
         gradio_progress=progress)
     if USE_DIFFUSION:
-        return videos[0], wavs[0], videos[1], wavs[1]
-    return videos[0], wavs[0], None, None
 def toggle_audio_src(choice):
     if choice == "mic":
@@ -240,9 +222,9 @@ def toggle_audio_src(choice):
 def toggle_diffusion(choice):
     if choice == "MultiBand_Diffusion":
-        return [gr.update(visible=True)] * 2
     else:
-        return [gr.update(visible=False)] * 2
 def ui_full(launch_kwargs):
@@ -292,14 +274,12 @@ def ui_full(launch_kwargs):
                     temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
-                output = gr.Video(label="Generated Music")
                 audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
-                diffusion_output = gr.Video(label="MultiBand Diffusion Decoder")
                 audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath')
-        submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False,
                      show_progress=False).then(predict_full, inputs=[secret_token, model, model_path, decoder, text, melody, duration, topk, topp,
                                                                      temperature, cfg_coef],
-                                               outputs=[output, audio_output, diffusion_output, audio_diffusion])
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
         gr.Markdown(
@@ -349,71 +329,6 @@ def ui_full(launch_kwargs):
         interface.queue().launch(**launch_kwargs)
-def ui_batched(launch_kwargs):
-    with gr.Blocks() as demo:
-        gr.Markdown(
-            """
-            # MusicGen
-            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md),
-            a simple and controllable model for music generation
-            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
-            <br/>
-            <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
-                style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
-                src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-            for longer sequences, more control and no queue.</p>
-            """
-        )
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
-                    with gr.Column():
-                        radio = gr.Radio(["file", "mic"], value="file",
-                                         label="Condition on a melody (optional) File or Mic")
-                        melody = gr.Audio(source="upload", type="numpy", label="File",
-                                          interactive=True, elem_id="melody-input")
-                with gr.Row():
-                    submit = gr.Button("Generate")
-            with gr.Column():
-                output = gr.Video(label="Generated Music")
-                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
-        submit.click(predict_batched, inputs=[text, melody],
-                     outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
-        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
-        gr.Markdown("""
-        ### More details
-        The model will generate 15 seconds of audio based on the description you provided.
-        The model was trained with description from a stock music catalog, descriptions that will work best
-        should include some level of details on the instruments present, along with some intended use case
-        (e.g. adding "perfect for a commercial" can somehow help).
-        You can optionally provide a reference audio from which a broad melody will be extracted.
-        The model will then try to follow both the description and melody provided.
-        For best results, the melody should be 30 seconds long (I know, the samples we provide are not...)
-        You can access more control (longer generation, more models etc.) by clicking
-        the <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
-                style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
-                src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        (you will then need a paid GPU from HuggingFace).
-        If you have a GPU, you can run the gradio demo locally (click the link to our repo below for more info).
-        Finally, you can get a GPU for free from Google
-        and run the demo in [a Google Colab.](https://ai.honu.io/red/musicgen-colab).
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md)
-        for more details. All samples are generated with the `stereo-melody` model.
-        """)
-        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -458,9 +373,6 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
     # Show the interface
-    if IS_BATCHED:
-        global USE_DIFFUSION
-        USE_DIFFUSION = False
-        ui_batched(launch_kwargs)
-    else:
-        ui_full(launch_kwargs)

 MODEL = None  # Last used model
 SPACE_ID = os.environ.get('SPACE_ID', '')
+IS_BATCHED = False # <- we hardcode it
 MAX_BATCH_SIZE = 12
 BATCHED_DURATION = 15
 INTERRUPTING = False
 file_cleaner = FileCleaner()
 def load_model(version='facebook/musicgen-melody'):
     global MODEL
     print("Loading model", version)
             outputs_diffusion = rearrange(outputs_diffusion, '(s b) c t -> b (s c) t', s=2)
         outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
     outputs = outputs.detach().cpu().float()
     out_wavs = []
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
             audio_write(
                 file.name, output, MODEL.sample_rate, strategy="loudness",
                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
             out_wavs.append(file.name)
             file_cleaner.add(file.name)
     print("batch finished", len(texts), time.time() - be)
     print("Tempfiles currently stored: ", len(file_cleaner.files))
+    return out_wavs
 def predict_batched(texts, melodies):
     max_text_length = 512
     texts = [text[:max_text_length] for text in texts]
     load_model('facebook/musicgen-stereo-melody')
+    return _do_predictions(texts, melodies, BATCHED_DURATION)
 def predict_full(secret_token, model, model_path, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
             raise gr.Error("Interrupted.")
     MODEL.set_custom_progress_callback(_progress)
+    wavs = _do_predictions(
         [text], [melody], duration, progress=True,
         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef,
         gradio_progress=progress)
     if USE_DIFFUSION:
+        return wavs[1]
+    return wavs[0]
 def toggle_audio_src(choice):
     if choice == "mic":
 def toggle_diffusion(choice):
     if choice == "MultiBand_Diffusion":
+        return [gr.update(visible=True)]
     else:
+        return [gr.update(visible=False)]
 def ui_full(launch_kwargs):
                     temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
                 audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath')
+        submit.click(toggle_diffusion, decoder, [audio_diffusion], queue=False,
                      show_progress=False).then(predict_full, inputs=[secret_token, model, model_path, decoder, text, melody, duration, topk, topp,
                                                                      temperature, cfg_coef],
+                                               outputs=[audio_output])
         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
         gr.Markdown(
         interface.queue().launch(**launch_kwargs)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
     # Show the interface
+    # we preload the model to avoid a timeout on the first request
+    load_model('facebook/musicgen-stereo-large')
+    ui_full(launch_kwargs)