sanchit-gandhi (HF staff) committed
Commit 29309b0
1 Parent(s): e9c24a1

Update app.py

Files changed (1): app.py (+46, −2)
app.py CHANGED
@@ -15,6 +15,38 @@ import spaces
 model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
 processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
 
+title = "MusicGen Streaming"
+
+description = """
+Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
+Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library.
+"""
+
+article = """
+## How Does It Work?
+
+MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
+At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. Given the
+frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to an audio waveform,
+each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
+20 seconds of audio.
+
+Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
+playing the audio after a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
+For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
+750 decoding steps to complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
+to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
+chunks of audio, each corresponding to 250 tokens.
+
+This method of playing incremental generations reduces the latency of the MusicGen model from the total time taken to generate
+1000 tokens to the time taken to generate the first chunk of audio (250 tokens). This can result in significant improvements to
+perceived latency, particularly when the chunk size is small. In practice, the chunk size should be tuned to your device: a smaller
+chunk size means the first chunk is ready sooner, but it should not be so small that the model generates audio more slowly than it
+can be played back.
+
+For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L51).
+"""
+
 
 class MusicgenStreamer(BaseStreamer):
     def __init__(
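The arithmetic in the article above reduces to a couple of constants. As a quick worked check (the 50 Hz frame rate follows from the 0.02 seconds per decoding step quoted in the article):

```python
frame_rate = 50                                  # EnCodec 32 kHz: one decoding step = 0.02 s of audio
total_steps = 20 * frame_rate                    # 1000 decoding steps for 20 s of audio
steps_per_chunk = 5 * frame_rate                 # 250 steps for each 5 s chunk
num_chunks = total_steps // steps_per_chunk      # 4 chunks of 250 tokens each

print(total_steps, steps_per_chunk, num_chunks)  # 1000 250 4
```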
@@ -181,12 +213,24 @@ demo = gr.Interface(
     fn=generate_audio,
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
-        gr.Slider(10, 30, value=15, step=5, label="Audio length in s"),
-        gr.Slider(2, 10, value=2, step=2, label="Streaming interval in s"),
+        gr.Slider(10, 30, value=15, step=5, label="Audio length in seconds"),
+        gr.Slider(0.5, 2.5, value=0.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
+        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
     ],
     outputs=[
         gr.Audio(label="Generated Music", streaming=True, autoplay=True)
     ],
+    examples=[
+        ["An 80s driving pop song with heavy drums and synth pads in the background", 20, 0.5, 5],
+        ["A cheerful country song with acoustic guitars", 15, 0.5, 5],
+        ["90s rock song with electric guitar and heavy drums", 15, 0.5, 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", 30, 0.5, 5],
+        ["lofi slow bpm electro chill with organic samples", 30, 0.5, 5],
+    ],
+    title=title,
+    description=description,
+    article=article,
 )
 
+
 demo.queue().launch()
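The article links to the Space's MusicgenStreamer for the full implementation. As a rough, hypothetical sketch of the pattern it describes (the class name and internals below are illustrative, not the Space's code), a streamer subclassing transformers' BaseStreamer can buffer the codes that `.generate()` pushes in and release a chunk every `play_steps` decoding steps:

```python
from queue import Queue
from transformers.generation.streamers import BaseStreamer

class ChunkedStreamer(BaseStreamer):
    """Simplified illustration only: the real MusicgenStreamer additionally
    undoes MusicGen's token delay pattern and decodes each chunk of codes
    to an audio waveform with the EnCodec audio encoder."""

    def __init__(self, play_steps: int):
        self.play_steps = play_steps  # decoding steps per released chunk, e.g. 250 for 5 s
        self.token_cache = []
        self.queue = Queue()
        self._sentinel = object()     # marks the end of generation

    def put(self, value):
        # .generate() calls this with the newly sampled token(s) at each decoding step
        self.token_cache.append(value)
        if len(self.token_cache) >= self.play_steps:
            self.queue.put(self.token_cache)  # a chunk is ready for playback
            self.token_cache = []

    def end(self):
        # .generate() calls this once decoding finishes: flush the final partial chunk
        if self.token_cache:
            self.queue.put(self.token_cache)
        self.queue.put(self._sentinel)

    def __iter__(self):
        return self

    def __next__(self):
        value = self.queue.get()      # blocks until the next chunk arrives
        if value is self._sentinel:
            raise StopIteration()
        return value
```

This mirrors the producer-consumer design of transformers' TextIteratorStreamer: generation fills the queue from one thread while the playback side iterates over it from another.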
 
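On the interface side, `gr.Audio(streaming=True)` plays each `(sampling_rate, chunk)` tuple yielded by the `fn` generator as it arrives. A minimal sketch of how a `generate_audio` generator could tie the pieces together, using the hypothetical ChunkedStreamer above and running generation in a background thread (the Space's actual function and its streamer's signature may differ):

```python
import torch
from threading import Thread

def generate_audio(prompt, audio_length_s, streaming_interval_s, seed):
    frame_rate = 50                                     # 0.02 s of audio per decoding step
    play_steps = int(streaming_interval_s * frame_rate)
    max_new_tokens = int(audio_length_s * frame_rate)

    torch.manual_seed(seed)                             # reproducible sampling for a given seed
    inputs = processor(text=prompt, return_tensors="pt")
    streamer = ChunkedStreamer(play_steps=play_steps)   # illustrative class from the sketch above

    # run generation in a background thread so chunks can be consumed as they are produced
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, do_sample=True, max_new_tokens=max_new_tokens),
    )
    thread.start()

    sampling_rate = model.config.audio_encoder.sampling_rate  # 32 kHz for musicgen-small
    for chunk in streamer:
        # the real MusicgenStreamer yields a decoded waveform here; this sketch yields raw codes
        yield sampling_rate, chunk
    thread.join()
```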
 