ai-tube-model-musicgen-1

Paused

App Files Files Community

adefossez committed on Jun 13, 2023

Commit

6457900

•

1 Parent(s): 4cf6900

plop

Browse files

Files changed (3) hide show

README.md +6 -5
app.py +25 -10
audiocraft/models/musicgen.py +6 -2

README.md CHANGED Viewed

@@ -38,11 +38,12 @@ pip install -e .  # or if you cloned the repo locally
 ## Usage
 We offer a number of ways to interact with MusicGen:
-1. You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally, or use the provided [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing).
-2. You can use the gradio demo locally by running `python app.py`.
-3. A demo is also available on the [`facebook/MusicGen`  HuggingFace Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
-4. Finally, you can run the [Gradio demo with a Colab GPU](https://colab.research.google.com/drive/1-Xe9NCdIs2sCUbiSmwHXozK6AAhMm7_i?usp=sharing),
-as adapted from [@camenduru Colab](https://github.com/camenduru/MusicGen-colab).
 ## API

 ## Usage
 We offer a number of ways to interact with MusicGen:
+1. A demo is available on the [`facebook/MusicGen` HuggingFace Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
+2. You can run the extended demo on a Colab: [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing).
+3. You can use the gradio demo locally by running `python app.py`.
+4. You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally (if you have a GPU).
+5. Finally, check out [@camenduru Colab page](https://github.com/camenduru/MusicGen-colab), which is regularly
+  updated with contributions from @camenduru and the community.
 ## API

app.py CHANGED Viewed

@@ -4,6 +4,9 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 import os
@@ -22,8 +25,9 @@ from audiocraft.models import MusicGen
 MODEL = None  # Last used model
 IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
-MAX_BATCH_SIZE = 12
 BATCHED_DURATION = 15
 # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
 _old_call = sp.call
@@ -37,10 +41,14 @@ def _call_nostderr(*args, **kwargs):
 sp.call = _call_nostderr
 # Preallocating the pool of processes.
-pool = ProcessPoolExecutor(3)
 pool.__enter__()
 def make_waveform(*args, **kwargs):
     # Further remove some warnings.
     be = time.time()
@@ -59,9 +67,6 @@ def load_model(version='melody'):
 def _do_predictions(texts, melodies, duration, **gen_kwargs):
-    if duration > MODEL.lm.cfg.dataset.segment_duration:
-        raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
     MODEL.set_generation_params(duration=duration, **gen_kwargs)
     print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
     be = time.time()
@@ -84,10 +89,10 @@ def _do_predictions(texts, melodies, duration, **gen_kwargs):
             descriptions=texts,
             melody_wavs=processed_melodies,
             melody_sample_rate=target_sr,
-            progress=False
         )
     else:
-        outputs = MODEL.generate(texts, progress=False)
     outputs = outputs.detach().cpu().float()
     out_files = []
@@ -110,9 +115,16 @@ def predict_batched(texts, melodies):
     return [res]
-def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     topk = int(topk)
     load_model(model)
     outs = _do_predictions(
         [text], [melody], duration,
@@ -136,6 +148,8 @@ def ui_full(launch_kwargs):
                     melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
                 with gr.Row():
                     submit = gr.Button("Submit")
                 with gr.Row():
                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                 with gr.Row():
@@ -190,7 +204,8 @@ def ui_full(launch_kwargs):
             This can take a long time, and the model might lose consistency. The model might also
             decide at arbitrary positions that the song ends.
-            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
@@ -207,7 +222,7 @@ def ui_full(launch_kwargs):
             """
         )
-        interface.queue().launch(**launch_kwargs, max_threads=1)
 def ui_batched(launch_kwargs):

 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+# Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
+# also released under the MIT license.
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 import os
 MODEL = None  # Last used model
 IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
+MAX_BATCH_SIZE = 8
 BATCHED_DURATION = 15
+INTERRUPTING = False
 # We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
 _old_call = sp.call
 sp.call = _call_nostderr
 # Preallocating the pool of processes.
+pool = ProcessPoolExecutor(4)
 pool.__enter__()
+def interrupt():
+    global INTERRUPTING
+    INTERRUPTING = True
 def make_waveform(*args, **kwargs):
     # Further remove some warnings.
     be = time.time()
 def _do_predictions(texts, melodies, duration, **gen_kwargs):
     MODEL.set_generation_params(duration=duration, **gen_kwargs)
     print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
     be = time.time()
             descriptions=texts,
             melody_wavs=processed_melodies,
             melody_sample_rate=target_sr,
+            progress=True
         )
     else:
+        outputs = MODEL.generate(texts, progress=True)
     outputs = outputs.detach().cpu().float()
     out_files = []
     return [res]
+def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
+    global INTERRUPTING
+    INTERRUPTING = False
     topk = int(topk)
     load_model(model)
+    def _progress(generated, to_generate):
+        progress((generated, to_generate))
+        if INTERRUPTING:
+            raise gr.Error("Interrupted.")
+    MODEL.set_custom_progress_callback(_progress)
     outs = _do_predictions(
         [text], [melody], duration,
                     melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
                 with gr.Row():
                     submit = gr.Button("Submit")
+                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
+                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                 with gr.Row():
                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                 with gr.Row():
             This can take a long time, and the model might lose consistency. The model might also
             decide at arbitrary positions that the song ends.
+            **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min). An overlap of 12 seconds
+            is kept with the previously generated chunk, and 18 "new" seconds are generated each time.
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
             """
         )
+        interface.queue().launch(**launch_kwargs)
 def ui_batched(launch_kwargs):

audiocraft/models/musicgen.py CHANGED Viewed

@@ -99,7 +99,7 @@ class MusicGen:
     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
                               top_p: float = 0.0, temperature: float = 1.0,
                               duration: float = 30.0, cfg_coef: float = 3.0,
-                              two_step_cfg: bool = False, extend_stride: float = 15):
         """Set the generation parameters for MusicGen.
         Args:
@@ -129,6 +129,7 @@ class MusicGen:
         }
     def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
         self._progress_callback = progress_callback
     def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
@@ -280,9 +281,11 @@ class MusicGen:
         def _progress_callback(generated_tokens: int, tokens_to_generate: int):
             generated_tokens += current_gen_offset
             if self._progress_callback is not None:
                 self._progress_callback(generated_tokens, total_gen_len)
             else:
-            print(f'{current_gen_offset + generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
         if prompt_tokens is not None:
             assert max_prompt_len >= prompt_tokens.shape[-1], \
@@ -326,6 +329,7 @@ class MusicGen:
                     # we wouldn't have the full wav.
                     initial_position = int(time_offset * self.sample_rate)
                     wav_target_length = int(self.max_duration * self.sample_rate)
                     positions = torch.arange(initial_position,
                                              initial_position + wav_target_length, device=self.device)
                     attr.wav['self_wav'] = WavCondition(

     def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
                               top_p: float = 0.0, temperature: float = 1.0,
                               duration: float = 30.0, cfg_coef: float = 3.0,
+                              two_step_cfg: bool = False, extend_stride: float = 18):
         """Set the generation parameters for MusicGen.
         Args:
         }
     def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+        """Override the default progress callback."""
         self._progress_callback = progress_callback
     def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
         def _progress_callback(generated_tokens: int, tokens_to_generate: int):
             generated_tokens += current_gen_offset
             if self._progress_callback is not None:
+                # Note that total_gen_len might be quite wrong depending on the
+                # codebook pattern used, but with delay it is almost accurate.
                 self._progress_callback(generated_tokens, total_gen_len)
             else:
+                print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
         if prompt_tokens is not None:
             assert max_prompt_len >= prompt_tokens.shape[-1], \
                     # we wouldn't have the full wav.
                     initial_position = int(time_offset * self.sample_rate)
                     wav_target_length = int(self.max_duration * self.sample_rate)
+                    print(initial_position / self.sample_rate, wav_target_length / self.sample_rate)
                     positions = torch.arange(initial_position,
                                              initial_position + wav_target_length, device=self.device)
                     attr.wav['self_wav'] = WavCondition(