Update app.py

app.py CHANGED
```diff
@@ -263,7 +263,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         max_new_tokens = generation_config.max_new_tokens
 
-        while current_generated_length +
+        while current_generated_length + 4 <= max_longform_generation_length:
             generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
             if is_greedy_gen_mode:
                 if generation_config.num_return_sequences > 1:
```
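This hunk tightens the driver of the chunked long-form loop: each iteration clamps the per-chunk token budget so the running total never exceeds the overall long-form limit, and the new guard stops once fewer than 4 tokens of budget remain. Below is a minimal, self-contained sketch of that control flow; `GenConfig` and `fake_generate` are illustrative stand-ins, and reading the `+ 4` as headroom for MusicGen's four codebook streams is an assumption, not something the diff states.

```python
from dataclasses import dataclass

@dataclass
class GenConfig:
    max_new_tokens: int = 500  # per-chunk token budget

def fake_generate(n_tokens: int) -> int:
    """Stand-in for one chunked generate() call; returns tokens produced."""
    return n_tokens

generation_config = GenConfig()
max_longform_generation_length = 1280  # total token budget for the track
max_new_tokens = generation_config.max_new_tokens
current_generated_length = 0

# mirror of the new loop guard: stop once fewer than 4 budget tokens remain
while current_generated_length + 4 <= max_longform_generation_length:
    # clamp each chunk so the total never exceeds the long-form budget
    generation_config.max_new_tokens = min(
        max_new_tokens, max_longform_generation_length - current_generated_length
    )
    current_generated_length += fake_generate(generation_config.max_new_tokens)

print(current_generated_length)  # 1280: exactly the long-form budget
```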
```diff
@@ -378,7 +378,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         # Specific to this gradio demo
         if streamer is not None:
-            streamer.end(True)
+            streamer.end(final_end=True)
 
         audio_scales = model_kwargs.get("audio_scales")
         if audio_scales is None:
```
```diff
@@ -414,7 +414,7 @@ title = "Streaming Long-form MusicGen"
 description = """
 Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
 
-The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
+The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the generated music to 1 min 20 s, but in theory, it could run **endlessly**.
 
 Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
 demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
```
```diff
@@ -468,6 +468,7 @@ class MusicgenStreamer(BaseStreamer):
         stride: Optional[int] = None,
         timeout: Optional[float] = None,
         is_longform: Optional[bool] = False,
+        longform_stride: Optional[float] = 10,
     ):
         """
         Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
```
```diff
@@ -496,6 +497,7 @@ class MusicgenStreamer(BaseStreamer):
         self.audio_encoder = model.audio_encoder
         self.generation_config = model.generation_config
         self.device = device if device is not None else model.device
+        self.longform_stride = longform_stride
 
         # variables used in the streaming process
         self.play_steps = play_steps
```
```diff
@@ -509,6 +511,8 @@ class MusicgenStreamer(BaseStreamer):
 
         self.is_longform = is_longform
 
+        self.previous_len = -1
+
         # varibles used in the thread process
         self.audio_queue = Queue()
         self.stop_signal = None
```
```diff
@@ -565,19 +569,19 @@ class MusicgenStreamer(BaseStreamer):
 
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-
-
-
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield = len(audio_values) - self.stride
+            self.previous_len = len(audio_values)
 
-    def end(self, stream_end=False):
+    def end(self, stream_end=False, final_end=False):
         """Flushes any remaining cache and appends the stop symbol."""
         if self.token_cache is not None:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
         else:
             audio_values = np.zeros(self.to_yield)
 
-
-
+        if final_end:
+            self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
 
     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
         """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
```
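The added lines implement stride-overlapped flushing: each flush hands over only the samples after `to_yield` and withholds the trailing `stride` samples, whose decoding can still change on the next pass, and the reworked `end()` releases the held-back tail only on the final call (`final_end=True`), so intermediate chunk generations no longer terminate the stream. A self-contained toy of that bookkeeping, with a plain NumPy array standing in for the decoded waveform:

```python
import numpy as np

stride = 4        # samples withheld at each flush (codec edges are unstable)
to_yield = 0      # how many samples have already been handed downstream
chunks = []

for total_len in (10, 20, 30):           # decoded audio grows at each flush
    audio_values = np.arange(total_len)  # stand-in for the decoded waveform
    chunks.append(audio_values[to_yield : -stride])  # only new, stable samples
    to_yield = len(audio_values) - stride

# final flush (the final_end=True path): release everything withheld so far
chunks.append(audio_values[to_yield:])
print(np.concatenate(chunks))  # 0..29, each sample emitted exactly once
```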
```diff
@@ -618,8 +622,10 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()
 
 @spaces.GPU(duration=90)
-def generate_audio(text_prompt, audio,
+def generate_audio(text_prompt, audio, seed=0):
+    audio_length_in_s = 60
     max_new_tokens = int(frame_rate * audio_length_in_s)
+    play_steps_in_s = 2.0
     play_steps = int(frame_rate * play_steps_in_s)
 
     if audio is not None:
```
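`generate_audio` now hardcodes the chunk length and streaming interval instead of exposing them as sliders. The surrounding token arithmetic is plain frame-rate scaling; the sketch below assumes the 50 Hz codec frame rate of the 32 kHz MusicGen checkpoints, which the diff itself does not show:

```python
# Assumed: MusicGen's audio encoder emits 50 token frames per second of audio.
frame_rate = 50

audio_length_in_s = 60   # hardcoded per-chunk length in the new generate_audio
play_steps_in_s = 2.0    # hardcoded streaming interval

max_new_tokens = int(frame_rate * audio_length_in_s)  # 3000 tokens ~= 60 s
play_steps = int(frame_rate * play_steps_in_s)        # flush every 100 tokens
print(max_new_tokens, play_steps)  # 3000 100
```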
```diff
@@ -649,7 +655,8 @@ def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2
         return_tensors="pt",
     )
 
-    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True
+    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True,
+                                longform_stride=15*32000)
 
     generation_kwargs = dict(
         **inputs.to(device),
```
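`longform_stride=15*32000` is 15 seconds of samples at MusicGen's 32 kHz output rate; note that the parameter's default of 10 reads as seconds, so the units deserve care. Downstream, the streamer is consumed as a queue-backed iterator fed from a generation thread. Here is a self-contained toy of that protocol; `ToyStreamer` is a hypothetical stand-in for the queue and stop-signal machinery `MusicgenStreamer` uses:

```python
from queue import Queue
from threading import Thread
import numpy as np

class ToyStreamer:
    """Minimal queue-backed audio streamer following the same iterator protocol."""
    def __init__(self):
        self.audio_queue = Queue()
        self.stop_signal = None

    def put(self, audio: np.ndarray):
        self.audio_queue.put(audio)             # called from the producer thread

    def end(self):
        self.audio_queue.put(self.stop_signal)  # sentinel ends iteration

    def __iter__(self):
        return self

    def __next__(self):
        value = self.audio_queue.get()  # blocks until the next chunk is ready
        if value is self.stop_signal:
            raise StopIteration()
        return value

streamer = ToyStreamer()

def produce():
    for i in range(3):
        streamer.put(np.full(4, i))  # stand-in for decoded audio chunks
    streamer.end()

Thread(target=produce).start()
for chunk in streamer:  # the gradio callback would yield (sampling_rate, chunk)
    print(chunk)
```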
```diff
@@ -678,19 +685,17 @@ demo = gr.Interface(
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
         gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
-        gr.Slider(30, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
         gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],
     outputs=[
         gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
     ],
     examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", None,
-        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3",
-        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3",
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None,
-        ["lofi slow bpm electro chill with organic samples", None,
+        ["An 80s driving pop song with heavy drums and synth pads in the background", None, 5],
+        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3", 5],
+        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3", 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 5],
+        ["lofi slow bpm electro chill with organic samples", None, 5],
     ],
     title=title,
     description=description,
```