Update app.py

app.py CHANGED
```diff
@@ -263,7 +263,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         max_new_tokens = generation_config.max_new_tokens
 
-        while current_generated_length +
+        while current_generated_length + 4 <= max_longform_generation_length:
             generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
             if is_greedy_gen_mode:
                 if generation_config.num_return_sequences > 1:
```
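This hunk tightens the driver of the chunked long-form loop: each iteration clamps the per-chunk token budget so the running total never exceeds the overall long-form limit, and the new guard stops once fewer than 4 tokens of budget remain. Below is a minimal, self-contained sketch of that control flow; `GenConfig` and `fake_generate` are illustrative stand-ins, and reading the `+ 4` as headroom for MusicGen's four codebook streams is an assumption, not something the diff states.

```python
from dataclasses import dataclass

@dataclass
class GenConfig:
    max_new_tokens: int = 500  # per-chunk token budget

def fake_generate(n_tokens: int) -> int:
    """Stand-in for one chunked generate() call; returns tokens produced."""
    return n_tokens

generation_config = GenConfig()
max_longform_generation_length = 1280  # total token budget for the track
max_new_tokens = generation_config.max_new_tokens
current_generated_length = 0

# mirror of the new loop guard: stop once fewer than 4 budget tokens remain
while current_generated_length + 4 <= max_longform_generation_length:
    # clamp each chunk so the total never exceeds the long-form budget
    generation_config.max_new_tokens = min(
        max_new_tokens, max_longform_generation_length - current_generated_length
    )
    current_generated_length += fake_generate(generation_config.max_new_tokens)

print(current_generated_length)  # 1280: exactly the long-form budget
```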
```diff
@@ -378,7 +378,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         # Specific to this gradio demo
         if streamer is not None:
-            streamer.end(True)
+            streamer.end(final_end=True)
 
         audio_scales = model_kwargs.get("audio_scales")
         if audio_scales is None:
```
```diff
@@ -414,7 +414,7 @@ title = "Streaming Long-form MusicGen"
 description = """
 Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
 
-The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
+The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the generated music to 1 min 20 s, but in theory, it could run **endlessly**.
 
 Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
 demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
```
```diff
@@ -468,6 +468,7 @@ class MusicgenStreamer(BaseStreamer):
         stride: Optional[int] = None,
         timeout: Optional[float] = None,
         is_longform: Optional[bool] = False,
+        longform_stride: Optional[float] = 10,
     ):
         """
         Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
```
```diff
@@ -496,6 +497,7 @@ class MusicgenStreamer(BaseStreamer):
         self.audio_encoder = model.audio_encoder
         self.generation_config = model.generation_config
         self.device = device if device is not None else model.device
+        self.longform_stride = longform_stride
 
         # variables used in the streaming process
         self.play_steps = play_steps
```
```diff
@@ -509,6 +511,8 @@ class MusicgenStreamer(BaseStreamer):
 
         self.is_longform = is_longform
 
+        self.previous_len = -1
+
         # varibles used in the thread process
         self.audio_queue = Queue()
         self.stop_signal = None
```
```diff
@@ -565,19 +569,19 @@ class MusicgenStreamer(BaseStreamer):
 
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-
-
-
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield = len(audio_values) - self.stride
+            self.previous_len = len(audio_values)
 
-    def end(self, stream_end=False):
+    def end(self, stream_end=False, final_end=False):
         """Flushes any remaining cache and appends the stop symbol."""
         if self.token_cache is not None:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
         else:
             audio_values = np.zeros(self.to_yield)
 
-
-
+        if final_end:
+            self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
 
     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
         """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
```
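The added lines implement stride-overlapped flushing: each flush hands over only the samples after `to_yield` and withholds the trailing `stride` samples, whose decoding can still change on the next pass, and the reworked `end()` releases the held-back tail only on the final call (`final_end=True`), so intermediate chunk generations no longer terminate the stream. A self-contained toy of that bookkeeping, with a plain NumPy array standing in for the decoded waveform:

```python
import numpy as np

stride = 4        # samples withheld at each flush (codec edges are unstable)
to_yield = 0      # how many samples have already been handed downstream
chunks = []

for total_len in (10, 20, 30):           # decoded audio grows at each flush
    audio_values = np.arange(total_len)  # stand-in for the decoded waveform
    chunks.append(audio_values[to_yield : -stride])  # only new, stable samples
    to_yield = len(audio_values) - stride

# final flush (the final_end=True path): release everything withheld so far
chunks.append(audio_values[to_yield:])
print(np.concatenate(chunks))  # 0..29, each sample emitted exactly once
```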
```diff
@@ -618,8 +622,10 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()
 
 @spaces.GPU(duration=90)
-def generate_audio(text_prompt, audio,
+def generate_audio(text_prompt, audio, seed=0):
+    audio_length_in_s = 60
     max_new_tokens = int(frame_rate * audio_length_in_s)
+    play_steps_in_s = 2.0
     play_steps = int(frame_rate * play_steps_in_s)
 
     if audio is not None:
```
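`generate_audio` now hardcodes the chunk length and streaming interval instead of exposing them as sliders. The surrounding token arithmetic is plain frame-rate scaling; the sketch below assumes the 50 Hz codec frame rate of the 32 kHz MusicGen checkpoints, which the diff itself does not show:

```python
# Assumed: MusicGen's audio encoder emits 50 token frames per second of audio.
frame_rate = 50

audio_length_in_s = 60   # hardcoded per-chunk length in the new generate_audio
play_steps_in_s = 2.0    # hardcoded streaming interval

max_new_tokens = int(frame_rate * audio_length_in_s)  # 3000 tokens ~= 60 s
play_steps = int(frame_rate * play_steps_in_s)        # flush every 100 tokens
print(max_new_tokens, play_steps)  # 3000 100
```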
```diff
@@ -649,7 +655,8 @@ def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2
         return_tensors="pt",
     )
 
-    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True
+    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True,
+                                longform_stride=15*32000)
 
     generation_kwargs = dict(
         **inputs.to(device),
```
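`longform_stride=15*32000` is 15 seconds of samples at MusicGen's 32 kHz output rate; note that the parameter's default of 10 reads as seconds, so the units deserve care. Downstream, the streamer is consumed as a queue-backed iterator fed from a generation thread. Here is a self-contained toy of that protocol; `ToyStreamer` is a hypothetical stand-in for the queue and stop-signal machinery `MusicgenStreamer` uses:

```python
from queue import Queue
from threading import Thread
import numpy as np

class ToyStreamer:
    """Minimal queue-backed audio streamer following the same iterator protocol."""
    def __init__(self):
        self.audio_queue = Queue()
        self.stop_signal = None

    def put(self, audio: np.ndarray):
        self.audio_queue.put(audio)             # called from the producer thread

    def end(self):
        self.audio_queue.put(self.stop_signal)  # sentinel ends iteration

    def __iter__(self):
        return self

    def __next__(self):
        value = self.audio_queue.get()  # blocks until the next chunk is ready
        if value is self.stop_signal:
            raise StopIteration()
        return value

streamer = ToyStreamer()

def produce():
    for i in range(3):
        streamer.put(np.full(4, i))  # stand-in for decoded audio chunks
    streamer.end()

Thread(target=produce).start()
for chunk in streamer:  # the gradio callback would yield (sampling_rate, chunk)
    print(chunk)
```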
```diff
@@ -678,19 +685,17 @@ demo = gr.Interface(
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
         gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
-        gr.Slider(30, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
         gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],
     outputs=[
         gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
     ],
     examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", None,
-        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3",
-        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3",
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None,
-        ["lofi slow bpm electro chill with organic samples", None,
+        ["An 80s driving pop song with heavy drums and synth pads in the background", None, 5],
+        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3", 5],
+        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3", 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 5],
+        ["lofi slow bpm electro chill with organic samples", None, 5],
     ],
     title=title,
     description=description,
```